Deng,N. et al. (2013) Detecting Splicing Variants in Idiopathic Pulmonary Fibrosis from Non-Differentially Expressed Genes. PLoS One, 8, e68352.
library(rats)
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 3.4.3
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.2
library(data.table)
library(ggplot2)
library(matrixStats)
library(assertthat)
#basedir <- "/homes/kfroussios/PROJECTS/rats"
basedir <- "/Volumes/kfroussios/PROJECTS/rats"
rat_ipf_60 <- readRDS(file.path(basedir,"DE/rats_human_60.RDS"))
rat_ipf_87 <- readRDS(file.path(basedir,"DE/rats_human_87.RDS"))
print( rat_ipf_60$Parameters[c('rats_version', 'R_version', 'abund_scaling', 'abund_thresh', 'p_thresh',
'dprop_thresh', 'quant_boot', 'quant_reprod_thresh', 'quant_bootnum',
'rep_boot', 'rep_reprod_thresh')] )
## $rats_version
## [1] '0.6.2'
##
## $R_version
## $R_version$platform
## [1] "x86_64-redhat-linux-gnu"
##
## $R_version$version.string
## [1] "R version 3.3.3 (2017-03-06)"
##
##
## $abund_scaling
## [1] 25
##
## $abund_thresh
## [1] 0
##
## $p_thresh
## [1] 0.05
##
## $dprop_thresh
## [1] 0.2
##
## $quant_boot
## [1] TRUE
##
## $quant_reprod_thresh
## [1] 0.95
##
## $quant_bootnum
## [1] 1000
##
## $rep_boot
## [1] TRUE
##
## $rep_reprod_thresh
## [1] 0
print( rat_ipf_87$Parameters[c('rats_version', 'R_version', 'abund_scaling', 'abund_thresh', 'p_thresh',
'dprop_thresh', 'quant_boot', 'quant_reprod_thresh', 'quant_bootnum',
'rep_boot', 'rep_reprod_thresh')] )
## $rats_version
## [1] '0.6.2'
##
## $R_version
## $R_version$platform
## [1] "x86_64-redhat-linux-gnu"
##
## $R_version$version.string
## [1] "R version 3.3.3 (2017-03-06)"
##
##
## $abund_scaling
## [1] 25
##
## $abund_thresh
## [1] 0
##
## $p_thresh
## [1] 0.05
##
## $dprop_thresh
## [1] 0.2
##
## $quant_boot
## [1] TRUE
##
## $quant_reprod_thresh
## [1] 0.95
##
## $quant_bootnum
## [1] 1000
##
## $rep_boot
## [1] TRUE
##
## $rep_reprod_thresh
## [1] 0
# and assert that the thresholds used were the same between the annotations
print( are_equal(rat_ipf_60$Parameters$p_thresh, rat_ipf_87$Parameters$p_thresh) )
## [1] TRUE
pthresh <- rat_ipf_60$Parameters$p_thresh
print( are_equal(rat_ipf_60$Parameters$dprop_thresh, rat_ipf_87$Parameters$dprop_thresh) )
## [1] TRUE
dythresh <- rat_ipf_60$Parameters$dprop_thresh
print( are_equal(rat_ipf_60$Parameters$quant_reprod_thresh, rat_ipf_87$Parameters$quant_reprod_thresh) )
## [1] TRUE
repthresh <- rat_ipf_60$Parameters$quant_reprod_thresh
print( are_equal(rat_ipf_60$Parameters$abund_thresh, rat_ipf_87$Parameters$abund_thresh) )
## [1] TRUE
print( are_equal(rat_ipf_60$Parameters$rep_reprod_thresh, rat_ipf_87$Parameters$rep_reprod_thresh) )
## [1] TRUE
deds <- as.data.table(read.csv(file.path(basedir,"supplementary/Deng2013_DE+DS.txt"), header = TRUE, sep = "\t"))
names(deds) <- c("Gene", "DE_fold", "DS_p", "DS_fdr", "_lnfdr", "Region")
setkey(deds, Gene)
dge <- as.data.table(read.csv(file.path(basedir,"supplementary/Deng2013_DGE.txt"), header = TRUE, sep = "\t"))
names(dge) <- c("Gene", "fold", "p", "fdr", "A", "B")
dge[, A:= NULL]
dge[, B:= NULL]
setkey(dge, Gene)
dte <- as.data.table(read.csv(file.path(basedir,"supplementary/Deng2013_DTE.txt"), header = TRUE, sep = "\t"))
names(dte) <- c("target_id", "Gene", "parent_id", "fold", "p", "fdr")
setkey(dte, Gene)
gn2gid <- unique(dte[, .(Gene, parent_id)])
setkey(gn2gid, Gene)
suppa60 <- as.data.table(read.delim(file.path(basedir,"DE/suppa60.tsv.dpsi.temp.0"), header = TRUE))
suppa87 <- as.data.table(read.delim(file.path(basedir,"DE/suppa87.tsv.dpsi.temp.0"), header = TRUE))
names(suppa60) <- c("event","dpsi","pval")
names(suppa87) <- c("event","dpsi","pval")
# Split SUPPA event IDs into gene ID and transcript ID
suppa60[, gene_id := sapply(as.character(suppa60$event), function (x) strsplit(x, ";")[[1]][1])]
suppa60[, transc_id := sapply(as.character(suppa60$event), function (x) strsplit(x, ";")[[1]][2])]
setkey(suppa60, gene_id)
suppa87[, gene_id := sapply(as.character(suppa87$event), function (x) strsplit(x, ";")[[1]][1])]
suppa87[, transc_id := sapply(as.character(suppa87$event), function (x) strsplit(x, ";")[[1]][2])]
setkey(suppa87, gene_id)
suppa60[, DTU := (pval < pthresh & abs(dpsi) >= dythresh)]
suppa87[, DTU := (pval < pthresh & abs(dpsi) >= dythresh)]
drim60 <- as.data.table(read.table(file.path(basedir, 'DE/drimseq60.results.tsv'), header=TRUE))
setkey(drim60,gene_id)
drim87 <- as.data.table(read.table(file.path(basedir, 'DE/drimseq87.results.tsv'), header=TRUE))
setkey(drim87,gene_id)
drim60[, DTU := adj_pvalue < pthresh]
drim87[, DTU := adj_pvalue < pthresh]
… so that I don’t clutter every new plot command.
gth <- theme(axis.line.x= element_line(),
axis.line.y= element_line(),
strip.background= element_rect(fill= "grey95"),
strip.text.y= element_text(size= rel(1.2)),
strip.text.x= element_text(size= rel(1.1)),
panel.grid.major.x= element_blank(),
panel.grid.major.y= element_line(colour='grey90'),
panel.grid.minor= element_blank(),
panel.background= element_rect(fill = "white"),
legend.key = element_rect(fill = 'white')
)
# Overlap between annotations.
refcom <- intersect(rat_ipf_60$Genes$parent_id, rat_ipf_87$Genes$parent_id)
print( length(refcom) )
## [1] 42212
# Gene IDs unique to the new annotation, not present in the old.
gained <- setdiff(rat_ipf_87$Genes$parent_id, rat_ipf_60$Genes$parent_id)
print( length(gained) )
## [1] 15839
# Gene IDs unique to the old annoation, not present in the new.
lost <- setdiff(rat_ipf_60$Genes$parent_id, rat_ipf_87$Genes$parent_id)
print( length(lost) )
## [1] 10253
# Ensembl 60
print( dtu_summary(rat_ipf_60) )
## category tally
## 1 DTU genes (gene test) 673
## 2 non-DTU genes (gene test) 17822
## 3 ineligible genes (gene test) 33970
## 4 DTU genes (transc. test) 553
## 5 non-DTU genes (transc. test) 18508
## 6 ineligible genes (transc. test) 33404
## 7 DTU genes (both tests) 534
## 8 non-DTU genes (both tests) 17803
## 9 ineligible genes (both tests) 33970
## 10 DTU transcripts 772
## 11 non-DTU transcripts 120718
## 12 ineligible transcripts 35990
print( dtu_switch_summary(rat_ipf_60) )
## category genes
## 1 Primary switch (gene test) 485
## 2 Non-primary switch (gene test) 506
## 3 Primary switch (transc. test) 399
## 4 Non-primary switch (transc. test) 404
## 5 Primary switch (both tests) 383
## 6 Non-primary switch (both tests) 386
# Ensembl 87
print( dtu_summary(rat_ipf_87) )
## category tally
## 1 DTU genes (gene test) 817
## 2 non-DTU genes (gene test) 19628
## 3 ineligible genes (gene test) 37606
## 4 DTU genes (transc. test) 652
## 5 non-DTU genes (transc. test) 20241
## 6 ineligible genes (transc. test) 37158
## 7 DTU genes (both tests) 634
## 8 non-DTU genes (both tests) 19610
## 9 ineligible genes (both tests) 37606
## 10 DTU transcripts 833
## 11 non-DTU transcripts 157890
## 12 ineligible transcripts 39279
print( dtu_switch_summary(rat_ipf_87) )
## category genes
## 1 Primary switch (gene test) 615
## 2 Non-primary switch (gene test) 670
## 3 Primary switch (transc. test) 486
## 4 Non-primary switch (transc. test) 522
## 5 Primary switch (both tests) 470
## 6 Non-primary switch (both tests) 506
Get corresponding IDs
allids <- get_dtu_ids(rat_ipf_60)
allids_87 <- get_dtu_ids(rat_ipf_87)
# Gene level
rids60 <- allids[[1]]
rids87 <- allids_87[[1]]
# Genes aggregated from transcript level
rids60_2 <- allids[[4]]
rids87_2 <- allids_87[[4]]
# Genes reported by both methods
rids60_3 <- allids[[7]]
rids87_3 <- allids_87[[7]]
# Transcripts
rids60_4 <- allids[[10]]
rids87_4 <- allids_87[[10]]
Most conservative set of genes (both methods, both annotations)
print( length(intersect(allids[[7]], allids_87[[7]])) )
## [1] 208
Save ID lists to files.
write.table(allids[[7]], file=file.path(basedir,'results', 'dtu60_gene_ids.txt'), row.names=FALSE, col.names=FALSE, quote=FALSE)
write.table(allids_87[[7]], file=file.path(basedir,'results', 'dtu87_gene_ids.txt'), row.names=FALSE, col.names=FALSE, quote=FALSE)
# Genes, Genes aggregated from transcript predictions, Genes by both methods, Transcripts.
print(c( length(intersect(rids60, rids87)), length(intersect(rids60_2, rids87_2)), length(intersect(rids60_3, rids87_3)), length(intersect(rids60_4, rids87_4)) ))
## [1] 272 213 208 223
# Genes (aggregated from transcripts)
sids60 <- unique(as.character(suppa60$gene_id[ !is.na(suppa60$DTU) & suppa60$DTU]) )
sids87 <- unique(as.character(suppa87$gene_id[ !is.na(suppa87$DTU) & suppa87$DTU]) )
print( c(length(sids60), length(sids87)) )
## [1] 780 753
# Transcripts
sids60_2 <- as.character(suppa60$transc_id[ !is.na(suppa60$DTU) & suppa60$DTU])
sids87_2 <- as.character(suppa87$transc_id[ !is.na(suppa87$DTU) & suppa87$DTU])
print(c( count(!is.na(suppa60$DTU) & suppa60$DTU), count(!is.na(suppa87$DTU) & suppa87$DTU) ))
## [1] 1391 1252
# Genes aggregated from transcript predictions, Transcripts.
print(c( length(intersect(sids60, sids87)), length(intersect(sids60_2, sids87_2)) ))
## [1] 257 374
# Genes
dids60 <- as.character( drim60$gene_id[!is.na(drim60$DTU) & drim60$DTU] )
dids87 <- as.character( drim87$gene_id[!is.na(drim87$DTU) & drim87$DTU] )
print( c(length(dids60), length(dids87)) )
## [1] 987 1680
print( length(intersect(dids60, dids87)) )
## [1] 541
These are only the genes with DTU but no DGE. This is not directly comparable to the output of the tools, as DGE is not taken into account there. However, the instersection of the sets is still informative, as the non-DGE DTU genes should be a subset of all the DTU genes.
# Verify number of DTU genes according to main article: 248
assert_that( 248 == length(deds[(DS_fdr < 0.05), Gene]) )
## [1] TRUE
# DTU gene IDs
pgids <- as.character(gn2gid[ as.character(deds[(DS_fdr < 0.05), Gene]), parent_id])
One threshold is swept through its range, while all the others are kept at the default values. Default values for significance (all three tools), effect size (RATs and SUPPA2), and techninal reproducibility (RATs only) are:
print( rat_ipf_87$Parameters[c('p_thresh', 'dprop_thresh', 'quant_reprod_thresh')] )
## $p_thresh
## [1] 0.05
##
## $dprop_thresh
## [1] 0.2
##
## $quant_reprod_thresh
## [1] 0.95
SUPPA2 reports per isoform significances. For equivalence it is compared to the RATs transcript-level predictions. For both tools, the predictions were made by applying thresholds to the statistical significance and the effect size.
# P-value (transcript level)
pvec <- seq(0, 1, 0.01)
xlen <- length(pvec)
r87t <- copy(rat_ipf_87$Transcripts)
s87t <- copy(suppa87)
ovec <- sapply(pvec, function(p) {
r87t[(elig), sig := pval_corr < p ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
s87t[, DTU := (pval < p & abs(dpsi) >= dythresh)]
s87 <- as.character(s87t$transc_id[ !is.na(s87t$DTU) & s87t$DTU])
rs <- intersect(r87, s87)
return(length(rs))
})
s87t <- copy(suppa87)
svec <- sapply(pvec, function(p) {
s87t[, DTU := (pval < p & abs(dpsi) >= dythresh)]
s87 <- as.character(s87t$transc_id[ !is.na(s87t$DTU) & s87t$DTU])
return(length(s87))
})
r87t <- copy(rat_ipf_87$Transcripts)
rvec <- sapply(pvec, function(p) {
r87t[(elig), sig := pval_corr < p ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
return(length(r87))
})
# Set up data
df1 <- data.frame(p_cutoff=rep(pvec, times=2),
transcripts=c(svec, rvec),
tool=c(rep('SUPPA2', times=xlen), rep('RATs', times=xlen) ))
df2 <- data.frame(p_cutoff=rep(pvec, times=2),
overlap=c(ovec/rvec, ovec/svec),
tool=c(rep('RATs', times=xlen), rep('SUPPA2', times=xlen) ))
# Totals
ggplot(df1) + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_line(aes(p_cutoff, y=log10(transcripts + 1), colour=tool)) +
ggtitle('RATs & SUPPA2 totals VS significance cutoff') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 4))
# Overlap
ggplot(df2) + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_line(aes(p_cutoff, y=overlap, colour=tool)) +
ggtitle('RATs & SUPPA2 overlap VS significance cutoff') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1))
## Warning: Removed 2 rows containing missing values (geom_path).
RATs’ response saturates very quickly, remains steady for a section, and then rises again slowly as the cutoff goes through low abundance transcripts with low p-values (this second increase stage disappears if abundances below 5 reads are filtered out). SUPPA2’s number of reported DTU transcripts rises quickly and levels off for significances over 0.25 while having reached an order of magnitude more DTU transcripts than RATs, at which point the overlap encompasses the large majority of RAT’s results. Near the default cutoff of 0.05, RATs and SUPPA2 almost coincide in terms of output volume and in terms of overlap. The ability of RATs to overlap with SUPPA2 is highest for the strictest of significances and drops continuously at a rate matching SUPPA2’s rising totals. In return, SUPPA2 recaptures most of RATs’ reported transcripts when the cutoff isaround 0.3. Just below the default cutoff of 0.05, the overlap is equally poor for both tools.
# Effect size (transcript level)
dvec <- seq(0, 1, 0.01)
xlen <- length(dvec)
r87t <- copy(rat_ipf_87$Transcripts)
s87t <- copy(suppa87)
ovec <- sapply(dvec, function(d) {
r87t[(elig), elig_fx := abs(Dprop) >= d ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
s87t[, DTU := (pval < pthresh & abs(dpsi) >= d)]
s87 <- as.character(s87t$transc_id[ !is.na(s87t$DTU) & s87t$DTU])
rs <- intersect(r87, s87)
return(length(rs))
})
s87t <- copy(suppa87)
svec <- sapply(dvec, function(d) {
s87t[, DTU := (pval < pthresh & abs(dpsi) >= d)]
s87 <- as.character(s87t$transc_id[ !is.na(s87t$DTU) & s87t$DTU])
return(length(s87))
})
r87t <- copy(rat_ipf_87$Transcripts)
rvec <- sapply(dvec, function(d) {
r87t[(elig), elig_fx := abs(Dprop) >= d ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
return(length(r87))
})
# Set up data
df1 <- data.frame(Dprop_threshold=rep(pvec, times=2),
transcripts=c(svec, rvec),
tool=c(rep('SUPPA2', times=xlen), rep('RATs',times=xlen) ))
df2 <- data.frame(Dprop_threshold=rep(pvec, times=2),
overlap=c(ovec/svec, ovec/rvec),
tool=c(rep('SUPPA2', times=xlen), rep('RATs',times=xlen) ))
# Totals
ggplot(df1) + gth +
geom_vline(xintercept=dythresh, colour='grey50') +
geom_line(aes(Dprop_threshold, y=log10 (transcripts + 1), colour=tool)) +
ggtitle('RATs & SUPPA2 totals VS effect size threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple')) +
coord_cartesian(xlim=c(0,1), ylim=c(0,5))
# Overlap
ggplot(df2) + gth +
geom_vline(xintercept=dythresh, colour='grey50') +
geom_line(aes(Dprop_threshold, y=overlap, colour=tool)) +
ggtitle('RATs & SUPPA2 overlap VS effect size threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1))
## Warning: Removed 9 rows containing missing values (geom_path).
SUPPA2 shows a uniform response up to the default threshold value of 0.20. This flat section may be attributable to small effect sizes receiving low p-values. On the other hand, RATs shows a very high level of overpredictions at low effect sizes, but drops rapidly until the default threshold. From there onwards RATs and SUPPA2 show very similar totals. The overlap between the two tools is comparable and decreases between the 0.25 and 0.65. Beyond that point the extent of overlap becomes quite noisy as the absolute number of transcripts is fairly small. The maximum overlap relative to RATs results comes around the default threshold values of 0.2. Relative to SUPPA2’s results, the maximum overlap is achieved slightly higher at 0.3, ignoring for the noisy upper part of the effect size range.
Overall, from these two plots, RATs appears to be controlled more by the effect size, whereas SUPPA2 is controlled more by the p-value.
# Quantification reroducibility (transcript level)
# SUPPA2 does not have this feature, so the response will be a uniform line across the range.
qvec <- seq(0, 1, 0.01)
r87t <- copy(rat_ipf_87$Transcripts)
ovec <- sapply(qvec, function(q) {
r87t[(elig), quant_reprod := quant_dtu_freq >= q ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
rs87 <- intersect(r87, sids87_2)
return(length(rs87))
})
svec <- rep(length(sids87_2), length(qvec))
r87t <- copy(rat_ipf_87$Transcripts)
rvec <- sapply(qvec, function(q) {
r87t[(elig), quant_reprod := quant_dtu_freq >= q ]
r87t[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87t$target_id[ !is.na(r87t$DTU) & r87t$DTU ] )
return(length(r87))
})
# Set up data
df1 <- data.frame(reprod_threshold=rep(pvec, times=2),
transcripts=c(svec, rvec),
tool=c(rep('SUPPA2', times=xlen), rep('RATs',times=xlen) ))
df2 <- data.frame(reprod_threshold=rep(pvec, times=2),
overlap=c(ovec/svec, ovec/rvec),
tool=c(rep('SUPPA2', times=xlen), rep('RATs',times=xlen) ))
# Totals
ggplot(df1) + gth +
geom_vline(xintercept=repthresh, colour='grey50') +
geom_line(aes(reprod_threshold, y=log10 (transcripts + 1), colour=tool)) +
ggtitle('RATs & SUPPA2 totals VS reproducibility threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple')) +
coord_cartesian(xlim=c(0,1), ylim=c(0,5))
# Overlap
ggplot(df2) + gth +
geom_vline(xintercept=repthresh, colour='grey50') +
geom_line(aes(reprod_threshold, y=overlap, colour=tool)) +
ggtitle('RATs & SUPPA2 overlap VS reproducibility threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1))
RATs’ number of reported transcripts is steady up to 0.5 (50% of iterations lead to positive DTU). After this point, RATs starts rejecting transcripts at a steady logarithmic pace until the top end of the range, where the rejection rate steeply increases. Until the 0.5 point, the overlap of the two tools encompasses the majority of SUPPA2’s results. Above the middle, the overlap decreases relative to SUPPA2 and increases relative to RATs.
DRIMSeq reports significance at the gene level. For equivalence, we use RATs’ gene-level predictions.
# P-value (gene level)
pvec <- seq(0, 1, 0.01)
r87g <- copy(rat_ipf_87$Genes)
d87g <- copy(drim87)
ovec <- sapply(pvec, function(p) {
r87g[(elig), sig := pval_corr < p ]
r87g[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87g$parent_id[ !is.na(r87g$DTU) & r87g$DTU ] )
d87g[, DTU := adj_pvalue < p]
d87 <- as.character( d87g$gene_id[!is.na(d87g$DTU) & d87g$DTU] )
rd <- intersect(r87, d87)
return(length(rd))
})
d87g <- copy(drim87)
dvec <- sapply(pvec, function(p) {
d87g[, DTU := adj_pvalue < p]
d87 <- as.character( d87g$gene_id[!is.na(d87g$DTU) & d87g$DTU] )
return(length(d87))
})
r87g <- copy(rat_ipf_87$Genes)
rvec <- sapply(pvec, function(p) {
r87g[(elig), sig := pval_corr < p ]
r87g[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87g$parent_id[ !is.na(r87g$DTU) & r87g$DTU ] )
return(length(r87))
})
# Set up data
df1 <- data.frame(p_cutoff=rep(pvec, times=2),
genes=c(dvec, rvec),
tool=c(rep('DRIMseq', times=xlen), rep('RATs',times=xlen) ))
df2 <- data.frame(p_cutoff=rep(pvec, times=2),
overlap=c(ovec/dvec, ovec/rvec),
tool=c(rep('DRIMseq', times=xlen), rep('RATs',times=xlen) ))
# Totals
ggplot(df1) + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_line(aes(p_cutoff, y=log10 (genes + 1), colour=tool)) +
ggtitle('RATs & DRIM-seq totals VS significance cutoff') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple')) +
coord_cartesian(xlim=c(0,1), ylim=c(0,5))
# Overlap
ggplot(df2) + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_line(aes(p_cutoff, y=overlap, colour=tool)) +
ggtitle('RATs & DRIM-seq overlap VS significance cutoff') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1))
## Warning: Removed 2 rows containing missing values (geom_path).
RATs behaves the same way at the gene level as it did at the transcript level: a sharp rise in reported genes followed by plateau and then a second phase of increase as low abundance genes come into play. The number of genes reported by DRIM-seq rises continuously through the range, reaching over an order of magnitude more genes than RATs. Unlike SUPPA2, DRIM-seq does not reach a saturation point. With regards to RATs’ results, the overlap increases until 0.3 and then levels off. With regards to DRIM-seq, the overlap decreases through the entire range. They both start around a 20% overlap for the strictest significance cutoff.
# Quantification reproducibility (gene level)
# DRIMSeq dos not have this feature, so the response will be a uniform line across the range.
qvec <- seq(0, 1, 0.01)
r87g <- copy(rat_ipf_87$Genes)
ovec <- sapply(qvec, function(q) {
r87g[(elig), quant_reprod := quant_dtu_freq >= q ]
r87g[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87g$parent_id[ !is.na(r87g$DTU) & r87g$DTU ] )
rd87g <- intersect(r87, dids87)
return(length(rd87g))
})
dvec <- rep(length(dids87), length(qvec))
r87g <- copy(rat_ipf_87$Genes)
rvec <- sapply(qvec, function(q) {
r87g[(elig), quant_reprod := quant_dtu_freq >= q ]
r87g[(elig), DTU := elig & sig & elig_fx & quant_reprod]
r87 <- as.character( r87g$parent_id[ !is.na(r87g$DTU) & r87g$DTU ] )
return(length(r87))
})
# Set up data
df1 <- data.frame(reprod_threshold=rep(pvec, times=2),
genes=c(dvec, rvec),
tool=c(rep('DRIMseq', times=xlen), rep('RATs',times=xlen) ))
df2 <- data.frame(reprod_threshold=rep(pvec, times=2),
overlap=c(ovec/dvec, ovec/rvec),
tool=c(rep('DRIMseq', times=xlen), rep('RATs',times=xlen) ))
# Totals
ggplot(df1) + gth +
geom_vline(xintercept=repthresh, colour='grey50') +
geom_line(aes(reprod_threshold, y=log10 (genes + 1), colour=tool)) +
ggtitle('RATs & DRIM-seq totals VS reproducibility threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple')) +
coord_cartesian(xlim=c(0,1), ylim=c(0,5))
# Overlap
ggplot(df2) + gth +
geom_vline(xintercept=repthresh, colour='grey50') +
geom_line(aes(reprod_threshold, y=overlap, colour=tool)) +
ggtitle('RATs & DRIM-seq overlap VS reproducibility threshold') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(values=c(RATs='red', SUPPA2='blue', DRIMseq='purple'))+
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1))
RATs starts with more genes reported than DRIMSeq, until the threshold reaches 0.85 where the relationship inverses. The overlap stays well below either tool’s results through the whole range. The overlap increases relative to RATs’ results, and decreases relative to DRIM-seq’s and the two trends meet around 0.85 with an overlap just over 30% for both tools.
All proportions are relative to the genes reported in the original study.
# gene-level calls
rp60 <- intersect(rids60, pgids)
rp87 <- intersect(rids87, pgids)
print( c(length(rp60)/length(pgids), length(rp87)/length(pgids)) )
## [1] 0.1129032 0.1250000
# genes aggregated from transcript-level calls
rp60 <- intersect(rids60_2, pgids)
rp87 <- intersect(rids87_2, pgids)
print( c(length(rp60)/length(pgids), length(rp87)/length(pgids)) )
## [1] 0.1129032 0.1250000
# genes found by both methods
rp60 <- intersect(rids60_3, pgids)
rp87 <- intersect(rids87_3, pgids)
print( c(length(rp60)/length(pgids), length(rp87)/length(pgids)) )
## [1] 0.1129032 0.1250000
sp60 <- intersect(sids60, pgids)
sp87 <- intersect(sids87, pgids)
print( c(length(sp60)/length(pgids), length(sp87)/length(pgids)) )
## [1] 0.1653226 0.1209677
dp60 <- intersect(dids60, pgids)
dp87 <- intersect(dids87, pgids)
print( c(length(dp60)/length(pgids), length(dp87)/length(pgids)) )
## [1] 0.2580645 0.2822581
Match gene IDs to the gene names
ex1 <- as.character(gn2gid["TOM1L1", parent_id])
ex2 <- as.character(gn2gid["CMTM4", parent_id])
ex3 <- as.character(gn2gid["PEX11B", parent_id])
print( c(ex1, ex2, ex3) )
## [1] "ENSG00000141198" "ENSG00000183723" "ENSG00000131779"
# RATs Gene level
print( rat_ipf_60$Genes[c(ex1,ex2,ex3), .(parent_id, known_transc, elig_transc, DTU, pval_corr, maxDprop, quant_dtu_freq)] )
## parent_id known_transc elig_transc DTU pval_corr maxDprop
## 1: ENSG00000141198 2 2 FALSE 2.282536e-06 0.2373295
## 2: ENSG00000183723 2 2 TRUE 3.115858e-18 0.3984406
## 3: ENSG00000131779 2 2 FALSE 2.275480e-26 0.2161318
## quant_dtu_freq
## 1: 0.669
## 2: 1.000
## 3: 0.677
# RATs Transcript level
print( rat_ipf_60$Transcripts[c(ex1,ex2,ex3), .(target_id, parent_id, DTU, pval_corr, Dprop, quant_dtu_freq)] )
## target_id parent_id DTU pval_corr Dprop
## 1: ENST00000348161 ENSG00000141198 FALSE 6.386841e-06 0.2373295
## 2: ENST00000445275 ENSG00000141198 FALSE 6.386841e-06 -0.2373295
## 3: ENST00000330687 ENSG00000183723 TRUE 1.361254e-17 0.3984406
## 4: ENST00000394106 ENSG00000183723 TRUE 1.361254e-17 -0.3984406
## 5: ENST00000369306 ENSG00000131779 FALSE 1.123613e-25 -0.2161318
## 6: ENST00000428634 ENSG00000131779 FALSE 1.123613e-25 0.2161318
## quant_dtu_freq
## 1: 0.669
## 2: 0.669
## 3: 1.000
## 4: 1.000
## 5: 0.677
## 6: 0.677
# SUPPA2
print( suppa60[c(ex1,ex2,ex3), ] )
## event dpsi pval gene_id
## 1: ENSG00000141198;ENST00000348161 0.2510453 0.042957043 ENSG00000141198
## 2: ENSG00000141198;ENST00000445275 -0.2510453 0.042957043 ENSG00000141198
## 3: ENSG00000183723;ENST00000330687 0.3936542 0.008491508 ENSG00000183723
## 4: ENSG00000183723;ENST00000394106 -0.3936542 0.008491508 ENSG00000183723
## 5: ENSG00000131779;ENST00000369306 -0.2436377 0.014485514 ENSG00000131779
## 6: ENSG00000131779;ENST00000428634 0.2436377 0.014485514 ENSG00000131779
## transc_id DTU
## 1: ENST00000348161 TRUE
## 2: ENST00000445275 TRUE
## 3: ENST00000330687 TRUE
## 4: ENST00000394106 TRUE
## 5: ENST00000369306 TRUE
## 6: ENST00000428634 TRUE
# DRIMSEQ
print( drim60[c(ex1,ex2,ex3), ] )
## gene_id lr df pvalue adj_pvalue DTU
## 1: ENSG00000141198 7.342972 1 6.732543e-03 0.0813511472 FALSE
## 2: ENSG00000183723 22.074237 1 2.623066e-06 0.0002986834 TRUE
## 3: ENSG00000131779 7.713477 1 5.481008e-03 0.0717255316 FALSE
Plot abundances
# TOM1L1
print( plot_gene(rat_ipf_60, ex1) )
# Cleaner version
print( plot_gene(rat_ipf_60, ex1, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_60$Genes[ex1, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
# CMTM4
print( plot_gene(rat_ipf_60, ex2) )
# Cleaner version
print( plot_gene(rat_ipf_60, ex2, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_60$Genes[ex2, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
# PEX11B
print( plot_gene(rat_ipf_60, ex3) )
# Cleaner version
print( plot_gene(rat_ipf_60, ex3, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_60$Genes[ex3, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
# RATs Gene level
print( rat_ipf_87$Genes[c(ex1,ex2,ex3), .(parent_id, known_transc, elig_transc, DTU, pval_corr, maxDprop, quant_dtu_freq)] )
## parent_id known_transc elig_transc DTU pval_corr maxDprop
## 1: ENSG00000141198 23 23 FALSE 9.472062e-18 0.09733825
## 2: ENSG00000183723 5 5 TRUE 2.954430e-34 0.36939499
## 3: ENSG00000131779 3 3 TRUE 3.158754e-42 -0.25642376
## quant_dtu_freq
## 1: 0.105
## 2: 1.000
## 3: 0.960
# RATs Transcript level
print( rat_ipf_87$Transcripts[c(ex1,ex2,ex3), .(target_id, parent_id, DTU, pval_corr, Dprop, quant_dtu_freq)] )
## target_id parent_id DTU pval_corr Dprop
## 1: ENST00000348161 ENSG00000141198 FALSE 9.327874e-01 -6.099906e-03
## 2: ENST00000445275 ENSG00000141198 FALSE 7.497407e-01 3.217174e-03
## 3: ENST00000536554 ENSG00000141198 FALSE 4.032820e-01 -1.957579e-02
## 4: ENST00000570371 ENSG00000141198 FALSE 1.000000e+00 0.000000e+00
## 5: ENST00000570499 ENSG00000141198 FALSE 3.769740e-01 1.543703e-02
## 6: ENST00000570623 ENSG00000141198 FALSE 7.365274e-02 2.213919e-02
## 7: ENST00000570965 ENSG00000141198 FALSE 5.016414e-03 -1.447868e-02
## 8: ENST00000570977 ENSG00000141198 FALSE 2.615997e-05 -6.135538e-02
## 9: ENST00000571319 ENSG00000141198 FALSE 1.197226e-03 4.830173e-02
## 10: ENST00000572158 ENSG00000141198 FALSE 4.460099e-05 -4.779305e-02
## 11: ENST00000572298 ENSG00000141198 FALSE 6.219603e-01 4.816911e-03
## 12: ENST00000572360 ENSG00000141198 FALSE 3.237801e-03 -1.575566e-02
## 13: ENST00000572405 ENSG00000141198 FALSE 9.678785e-01 -6.581152e-04
## 14: ENST00000572576 ENSG00000141198 FALSE 8.353608e-01 8.269278e-03
## 15: ENST00000572905 ENSG00000141198 FALSE 2.830548e-03 -1.614793e-02
## 16: ENST00000573607 ENSG00000141198 FALSE 1.000000e+00 0.000000e+00
## 17: ENST00000574318 ENSG00000141198 FALSE 5.614916e-06 9.733825e-02
## 18: ENST00000574653 ENSG00000141198 FALSE 5.350622e-03 -6.799348e-02
## 19: ENST00000574744 ENSG00000141198 FALSE 1.000000e+00 0.000000e+00
## 20: ENST00000575333 ENSG00000141198 FALSE 5.973969e-02 3.004371e-02
## 21: ENST00000575882 ENSG00000141198 FALSE 2.032758e-02 8.745145e-03
## 22: ENST00000575909 ENSG00000141198 FALSE 1.000000e+00 1.814385e-06
## 23: ENST00000576932 ENSG00000141198 FALSE 4.006414e-01 1.154775e-02
## 24: ENST00000330687 ENSG00000183723 TRUE 6.223203e-28 3.693950e-01
## 25: ENST00000394106 ENSG00000183723 FALSE 2.695784e-01 -2.418535e-02
## 26: ENST00000561680 ENSG00000183723 FALSE 2.239451e-14 -1.570551e-01
## 27: ENST00000563952 ENSG00000183723 FALSE 4.925613e-03 -6.532788e-02
## 28: ENST00000581487 ENSG00000183723 FALSE 3.240708e-03 -1.228267e-01
## 29: ENST00000369306 ENSG00000131779 TRUE 7.091002e-42 -2.564238e-01
## 30: ENST00000428634 ENSG00000131779 FALSE 2.950140e-34 2.346055e-01
## 31: ENST00000537888 ENSG00000131779 FALSE 1.213258e-03 2.181822e-02
## target_id parent_id DTU pval_corr Dprop
## quant_dtu_freq
## 1: 0.002
## 2: 0.000
## 3: 0.004
## 4: 0.000
## 5: 0.000
## 6: 0.000
## 7: 0.000
## 8: 0.004
## 9: 0.006
## 10: 0.001
## 11: 0.000
## 12: 0.000
## 13: 0.000
## 14: 0.001
## 15: 0.000
## 16: 0.000
## 17: 0.069
## 18: 0.028
## 19: 0.000
## 20: 0.000
## 21: 0.000
## 22: 0.000
## 23: 0.000
## 24: 1.000
## 25: 0.000
## 26: 0.158
## 27: 0.000
## 28: 0.007
## 29: 0.958
## 30: 0.845
## 31: 0.000
## quant_dtu_freq
# SUPPA2
print( suppa87[c(ex1,ex2,ex3), .(transc_id, gene_id, DTU, pval, dpsi)] )
## transc_id gene_id DTU pval dpsi
## 1: ENST00000348161 ENSG00000141198 FALSE 0.370629371 0.0294209549
## 2: ENST00000445275 ENSG00000141198 FALSE 0.370629371 -0.0084303424
## 3: ENST00000536554 ENSG00000141198 FALSE 0.370629371 -0.0383036830
## 4: ENST00000570371 ENSG00000141198 FALSE 0.370629371 0.0000000000
## 5: ENST00000570499 ENSG00000141198 FALSE 0.370629371 0.0105882791
## 6: ENST00000570623 ENSG00000141198 FALSE 0.370629371 0.0307256351
## 7: ENST00000570965 ENSG00000141198 FALSE 0.370629371 -0.0128215734
## 8: ENST00000570977 ENSG00000141198 FALSE 0.370629371 -0.0784246574
## 9: ENST00000571319 ENSG00000141198 FALSE 0.370629371 0.0415481876
## 10: ENST00000572158 ENSG00000141198 FALSE 0.370629371 -0.0417906430
## 11: ENST00000572298 ENSG00000141198 FALSE 0.370629371 0.0096458862
## 12: ENST00000572360 ENSG00000141198 FALSE 0.370629371 -0.0239342890
## 13: ENST00000572405 ENSG00000141198 FALSE 0.370629371 -0.0009666681
## 14: ENST00000572576 ENSG00000141198 FALSE 0.370629371 0.0069294058
## 15: ENST00000572905 ENSG00000141198 FALSE 0.370629371 -0.0140380470
## 16: ENST00000573607 ENSG00000141198 FALSE 0.370629371 0.0000000000
## 17: ENST00000574318 ENSG00000141198 FALSE 0.370629371 0.0699875223
## 18: ENST00000574653 ENSG00000141198 FALSE 0.370629371 -0.0384749956
## 19: ENST00000574744 ENSG00000141198 FALSE 0.370629371 0.0000000000
## 20: ENST00000575333 ENSG00000141198 FALSE 0.370629371 0.0358719823
## 21: ENST00000575882 ENSG00000141198 FALSE 0.370629371 0.0106521247
## 22: ENST00000575909 ENSG00000141198 FALSE 0.370629371 0.0000000000
## 23: ENST00000576932 ENSG00000141198 FALSE 0.370629371 0.0118149207
## 24: ENST00000330687 ENSG00000183723 TRUE 0.047452547 0.3510659052
## 25: ENST00000394106 ENSG00000183723 FALSE 0.300699301 -0.0102984652
## 26: ENST00000561680 ENSG00000183723 FALSE 0.146103896 -0.1566397247
## 27: ENST00000563952 ENSG00000183723 FALSE 0.170454545 -0.0732263313
## 28: ENST00000581487 ENSG00000183723 FALSE 0.147352647 -0.1109013839
## 29: ENST00000369306 ENSG00000131779 TRUE 0.005244755 -0.2641838975
## 30: ENST00000428634 ENSG00000131779 TRUE 0.005244755 0.2429758914
## 31: ENST00000537888 ENSG00000131779 FALSE 0.214285714 0.0212080060
## transc_id gene_id DTU pval dpsi
# DRIMSEQ
print( drim87[c(ex1,ex2,ex3), .(gene_id, DTU, adj_pvalue, lr)] )
## gene_id DTU adj_pvalue lr
## 1: ENSG00000141198 FALSE 0.43275492 23.031162
## 2: ENSG00000183723 TRUE 0.02130873 18.042706
## 3: ENSG00000131779 FALSE 0.10542350 8.397229
Plot abundances
# TOM1L1
print( plot_gene(rat_ipf_87, ex1) )
# Cleaner version
print( plot_gene(rat_ipf_87, ex1, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_87$Genes[ex1, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
# CMTM4
print( plot_gene(rat_ipf_87, ex2) )
# Cleaner version
print( plot_gene(rat_ipf_87, ex2, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_87$Genes[ex2, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
# PEX11B
print( plot_gene(rat_ipf_87, ex3) )
# Cleaner version
print( plot_gene(rat_ipf_87, ex3, style="byisoform", shapeby='none', colourby='condition', condcolvec=c('red','blue'))
# +
# guides(colour="none", fill='none') +
# scale_x_discrete(labels=seq.int(1,rat_ipf_87$Genes[ex3, known_transc])) +
# theme(axis.text.x=element_text(angle=0)) +
# scale_y_continuous(position='right', sec.axis=waiver())
)
Can I make sense of the differences between the tools’ reports?
RATs criteria are applied sequentially. Thus we can trace back which criteria are responsible for filtering out features that other tools report as positive hits.
# SUPPA2 (transcript level)
lrs <- length(rs87_2)
# Transcripts reported by SUPPA2 that are not reported by RATs.
ids <- rat_ipf_87$Transcripts$target_id %in% setdiff(sids87_2, rs87_2)
# Cumulative pass rate, as percentage of SUPPA2's total results, for the given thresholds.
df1 <- data.frame(value=c( lrs + dim(rat_ipf_87$Transcripts[ids,])[1],
lrs + count(rat_ipf_87$Transcripts[ids, elig_xp]),
lrs + count(rat_ipf_87$Transcripts[ids & elig_xp, elig]),
lrs + count(rat_ipf_87$Transcripts[ids & elig_xp & elig, sig]),
lrs + count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig, elig_fx]),
lrs + count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig & elig_fx, quant_reprod]) ) / length(sids87_2) * 100,
criterion=factor( c('before filters', 'abundance', '#_isoforms', 'significance', 'effect size', 'reproducibility'),
levels=c('before filters', 'abundance', '#_isoforms', 'significance', 'effect size', 'reproducibility') ) )
ggplot(df1) + gth +
geom_line(aes(x=criterion, y=value, group=1), colour='blue') +
ggtitle('Pass rate of unique SUPPA2 results through RATs') +
ylab('% SUPPA2 transcripts') +
xlab('RATs filter') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0, 101)) +
theme(axis.text.x=element_text(hjust=1, angle=30))
# Individual contributions, as percentage of SUPPA2's total results, for the given thresholds.
df2 <- data.frame(value=c( count(rat_ipf_87$Transcripts[ids, !elig_xp]),
count(rat_ipf_87$Transcripts[ids & elig_xp, !elig]),
count(rat_ipf_87$Transcripts[ids & elig_xp & elig, !sig]),
count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig, !elig_fx]),
count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig & elig_fx, !quant_reprod]) ) / length(sids87_2) * 100,
criterion=factor( c('abundance', '#_isoforms', 'significance', 'effect size', 'reproducibility'),
levels=c('abundance', '#_isoforms', 'significance', 'effect size', 'reproducibility') ),
dummy=rep('dummy',5) )
print( df2[,c(1,2)] )
## value criterion
## 1 0.000000 abundance
## 2 0.000000 #_isoforms
## 3 3.194888 significance
## 4 18.450479 effect size
## 5 43.370607 reproducibility
ggplot(df2) + gth +
geom_bar(aes(x=dummy, y=value, fill=criterion), stat='identity', position='stack', width=0.7) +
ggtitle('Cause of rejection of SUPPA2 results by RATs') +
ylab('% SUPPA2 transcripts') +
xlab('') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0, 100)) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values=c('green', 'purple','red','orange', 'blue'))
# Manuscript version of the above
df2 <- data.frame(value=c( count(rat_ipf_87$Transcripts[ids & elig_xp & elig, !sig]),
count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig, !elig_fx]),
count(rat_ipf_87$Transcripts[ids & elig_xp & elig & sig & elig_fx, !quant_reprod]) ) / length(sids87_2) * 100,
criterion=factor( c('significance', 'effect size', 'reproducibility'),
levels=c('reproducibility', 'effect size', 'significance') ),
dummy=rep('dummy',3) )
ggplot(df2) + gth +
geom_bar(aes(x=dummy, y=value, fill=criterion), stat='identity', position='stack', width=0.7) +
ggtitle('Cause of rejection of SUPPA2 results by RATs') +
ylab('% SUPPA2 transcripts') +
xlab('') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0, 100)) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values=c('blue','orange', 'red'))
Most SUPPA2-only hits are statistically significant also according to RATs. However a substantial fraction of them has effect sizes smaller than the threshold, as indicated by the drop in the number of transcripts. The remaining two thirds of the transcripts are eliminated at the reproducibility stage.
# RATs / SUPPA2 (transcripts)
ggplot() + gth +
geom_freqpoly(data=rat_ipf_87$Transcripts[target_id %in% rs87_2,], aes(x=quant_dtu_freq, colour='Overlap'), breaks=seq(0, 1, 0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Transcripts[target_id %in% setdiff(rids87_4, rs87_2),], aes(x=quant_dtu_freq, colour='RATs_only'), breaks=seq(0, 1, 0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Transcripts[target_id %in% setdiff(sids87_2, rs87_2),], aes(x=quant_dtu_freq, colour='SUPPA2_only'), breaks=seq(0, 1, 0.01), alpha=0.8) +
ggtitle('Number of DTU transcripts in common between RATs and SUPPA2') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(name='DTU', values=c(Overlap='red', RATs_only='green', SUPPA2_only='blue', DRIMSeq_only='purple')) +
ylab('Transcripts') +
ggtitle('Distribution of RATs reproducibility')
Although there is a small spike with 0% reproducibility that RATs consistently finds as non-DTU, most transcripts have various reproducibility through the entire range, with a slight increasing trend for higher reproducibility.
Together these plots tell us that for two thirds of SUPPA2’s uniquely reported transcripts, the quantifications are unstable. As a result, although they appear significant and of sufficient effect size when their respective bootstrapped estimates are averaged together, they in fact give conflicting results when considered individually. Lowering the reproducibility threshold from 95% to 50% would pick up many of these transcripts, but when quantifications are so variable they should probably not be trusted without scrutiny.
# DRIM-seq (gene level)
lrd <- length(rd87)
# Genes reported by DRIM-seq that are not reported by RATs.
ids <- rat_ipf_87$Genes$parent_id %in% setdiff(dids87, rd87)
# Cumulative pass rate, as percentage of DRIM-seq's total results, for the given thresholds.
df1 <- data.frame(value=c(lrd + dim(rat_ipf_87$Genes[ids,])[1],
lrd + count(rat_ipf_87$Genes[ids , elig]),
lrd + count(rat_ipf_87$Genes[ids & elig, sig]),
lrd + count(rat_ipf_87$Genes[ids & elig & sig, elig_fx]),
lrd + count(rat_ipf_87$Genes[ids & elig & sig & elig_fx, quant_reprod]) ) / length(dids87) * 100,
criterion=factor( c('origin','#_abund_isoforms','significance','effect size','reproducibility'),
levels=c('origin', '#_abund_isoforms','significance','effect size','reproducibility') ) )
ggplot(df1) + gth +
geom_line(aes(x=criterion, y=value, group=1), colour='purple') +
ggtitle('Pass rate of unique DRIMseq results through RATs') +
ylab('% DRIM-seq genes') +
xlab('RATs filter') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,101)) +
theme(axis.text.x=element_text(hjust=1, angle=30))
# Individual contributions, as percentage of DRIM-seq's total results, for the given thresholds.
df2 <- data.frame(value=c(count(rat_ipf_87$Genes[ids , !elig]),
count(rat_ipf_87$Genes[ids & elig, !sig]),
count(rat_ipf_87$Genes[ids & elig & sig, !elig_fx]),
count(rat_ipf_87$Genes[ids & elig & sig & elig_fx, !quant_reprod]) ) / length(dids87) * 100,
criterion=factor( c('#_abund_isoforms','significance','effect size','reproducibility'),
levels=c('#_abund_isoforms','significance','effect size','reproducibility') ),
dummy=rep('dummy',4) )
print( df2[,c(1,2)] )
## value criterion
## 1 0.000000 #_abund_isoforms
## 2 0.297619 significance
## 3 52.559524 effect size
## 4 27.619048 reproducibility
ggplot(df2) + gth +
geom_bar(aes(x=dummy, y=value, fill=criterion), stat='identity', position='stack', width=0.7) +
ggtitle('Cause of rejection of DRIM-seq results by RATs ') +
ylab('% DRIM-seq genes') +
xlab('') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,100)) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values=c('purple','red','orange', 'blue'))
# Manuscript version of the above.
df2 <- data.frame(value=c(count(rat_ipf_87$Genes[ids & elig, !sig]),
count(rat_ipf_87$Genes[ids & elig & sig, !elig_fx]),
count(rat_ipf_87$Genes[ids & elig & sig & elig_fx, !quant_reprod]) ) / length(dids87) * 100,
criterion=factor( c('significance','effect size','reproducibility'),
levels=c('reproducibility','effect size','significance') ),
dummy=rep('dummy',3) )
ggplot(df2) + gth +
geom_bar(aes(x=dummy, y=value, fill=criterion), stat='identity', position='stack', width=0.7) +
ggtitle('Cause of rejection of DRIM-seq results by RATs ') +
ylab('% DRIM-seq genes') +
xlab('') +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,100)) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values=c('blue','orange', 'red'))
RATs finds virtually all of DRIM-seq’s uniquely reported genes to be significant, however there is a major drop when the effect size is considered, with the remaining genes being rejected due to unstable quantifications.
# RATs / DRIMSeq (genes)
ggplot() + gth +
geom_freqpoly(data=rat_ipf_87$Genes[rd87,], aes(x=quant_dtu_freq, colour='Overlap'), breaks=seq(0, 1, 0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Genes[setdiff(rids87, rd87),], aes(x=quant_dtu_freq, colour='RATs_only'), breaks=seq(0, 1, 0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Genes[setdiff(dids87, rd87),], aes(x=quant_dtu_freq, colour='DRIMSeq_only'), breaks=seq(0, 1, 0.01), alpha=0.8) +
ggtitle('Number of DTU genes in common between RATs and DRIMSeq') +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
scale_colour_manual(name='DTU', values=c(Overlap='red', RATs_only='green', SUPPA2_only='blue', DRIMSeq_only='purple')) +
ylab('Transcripts') +
ggtitle('Distribution of RATs reproducibility') +
coord_cartesian(ylim=c(0,400))
The distribution of reproducibility shows that most of DRIM-seq’s uniquely reported genes have 0% reproducibility in RATs, indicating confident classification as non-DTU. There are is also a fraction of genes that are uniformly spread across the reproducibility range, indicating genes with very variable quantifications that should probably not be considered trustworthy.
DRIM-seq uses ratios, whereas RATs uses difference. These two do not correlate, especially when talking about percentages. This is clearly reflected in that there are many genes that DRIM-seq considers DTU that RATs confidently rejects.
ids <- setdiff(rids87_4, rs87_2)
# Significance distribution according to SUPPA2 and RATs, of genes predicted by RATs, but not by SUPPA2
ggplot() + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_freqpoly(data=suppa87[transc_id %in% ids,], aes(x=pval, colour=c('SUPPA2')), breaks=seq(0,1,0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Transcripts[target_id %in% ids,], aes(x=pval_corr, colour=c('RATs')), breaks=seq(0,1,0.01), alpha=0.8) +
ggtitle('SUPPA2 significance of RATs-only results') +
ylab('Number of Transcripts') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_manual(name='DTU', values=c(Overlap='red', RATs='green', SUPPA2='blue', DRIMSeq='purple')) +
coord_cartesian(ylim=c(0,400))
RAT’s assigns very high significance to all of these genes, while SUPPA2’s p-values hint at being the tail of a right-skewed significance distribution, trailing just above the cutoff point. We can consider these to be marginal misses reflecting the methodological differences in how DTU is tested, rather than the product of fundamental differences in concept.
# Effect Size distribution according to SUPPA2 and RATs, of genes predicted by RATs, but not by SUPPA2
ggplot() + gth +
geom_vline(xintercept=dythresh, colour='grey50') +
geom_freqpoly(data=suppa87[transc_id %in% ids,], aes(x=dpsi, colour='SUPPA2'), breaks=seq(0,1,0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Transcripts[target_id %in% ids,], aes(x=Dprop, colour=c('RATs')), breaks=seq(0,1,0.01), alpha=0.8) +
ggtitle('SUPPA2 effect size of RATs-only results') +
ylab('Number of Transcripts') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(0,1), ylim=c(0,20)) +
scale_colour_manual(name='DTU', values=c(Overlap='red', RATs='green', SUPPA2='blue', DRIMSeq='purple'))
A subset of the transcripts predicted only by RATs are reported as below the effect size threshold by SUPPA2. Considering the input data is based on the same TPM quantifications, and that the effect size is defined in a similar way, this difference is harder to explain. But the majority of transcripts do pass the threshold and the distributions of SUPPA2 and RATs are similar for them.
ids <- setdiff(rids87, rd87)
# Significance distribution according to DRIMSeq and RATs, of genes predicted by RATs, but not by DRIMSeq
ggplot() + gth +
geom_vline(xintercept=pthresh, colour='grey50') +
geom_freqpoly(data=drim87[ids,], aes(x=adj_pvalue, colour='DRIMSeq'), breaks=seq(0,1,0.01), alpha=0.8) +
geom_freqpoly(data=rat_ipf_87$Genes[ids,], aes(x=pval_corr, colour=c('RATs')), breaks=seq(0,1,0.01), alpha=0.8) +
ggtitle('Significance of RATs-only results') +
ylab('Number of Genes') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
scale_colour_manual(name='DTU', values=c(Overlap='red', RATs='green', SUPPA2='blue', DRIMSeq='purple')) +
coord_cartesian(ylim=c(0,500))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
The genes reported only by RATs are spread out over the pvalue range in DRIMSeq’s results, but do appear to be the right tail of right-skewed distribution, trailing above the cutoff point. From RATs’ perspective, these are all highly significant genes that do not approach the cutoff point at all.
# Effect size distribution according to DRIMSeq and RATs, of genes predicted by RATs, but not by DRIMSeq
ggplot(drim87[ids,]) + gth +
geom_freqpoly(aes(x=lr), colour='purple', breaks=seq.int(0,30,1)) +
ggtitle('DRIMSeq likelihood ratios of RATs-only results') +
ylab('Number of Genes') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,50))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
ggplot(rat_ipf_87$Genes[ids, ]) + gth +
geom_freqpoly(aes(x=abs(maxDprop)), colour='green', breaks=seq(0,1,0.01)) +
ggtitle('RATs difference in proportions for RATs-only results') +
ylab('Number of Genes') +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))
RATs and DRIM-seq use incompatible effect size measures, so they cannot be plotted on the same axes. Most of the RATs-only predicted DTU genes have likelihood ratios greater than 2 with some being greater than 10. Similarly, these genes contain isoforms that change in relative abundance by over 20 percentile points, with a few even changing by over 50 percentile points.
Genes reported across methods and annotations:
# Overlap of reported genes between paper results and RATs results.
# Both Ensembl 60 and 87: both gene-level and transcript-level
gids <- intersect(intersect(rids60_3, rids87_3), pgids)
print( length(gids) )
## [1] 18
Which are these genes and what do they look like?
for( g in gids ){
print(as.character(gn2gid[parent_id==g , Gene]))
print( plot_gene(rat_ipf_60, g, style="lines") )
print( plot_gene(rat_ipf_87, g, style="lines") )
}
## [1] "ITPRIP"
## [1] "SNRNP48"
## [1] "ISLR"
## [1] "SLC40A1"
## [1] "CXCL6"
## [1] "CMTM4"
## [1] "PRPS2"
## [1] "TTC35"
## [1] "MOBKL1B"
## [1] "HSD17B11"
## [1] "ARRDC3"
## [1] "RBPJ"
## [1] "STEAP4"
## [1] "SLAMF8"
## [1] "SCAMP4"
## [1] "LSS"
## [1] "ELL"
## [1] "WAS"
They are mostly genes whose annotation does not change much between the two Ensembl versions.
Why are all the other genes from the paper not picked up by RATs? For this we’ll focus on the gene-level predictions, in order to be able to sensibly plot specific properties.
# Gene IDs of genes reported by Deng et al., but not reported by RATs with either annotation.
ngids <- setdiff(pgids, union(rids60, rids87))
print(length(ngids))
## [1] 207
Is it the reproducibility?
# Ensembl 60
print(
ggplot(data= rat_ipf_60$Genes[ngids, .(parent_id, quant_dtu_freq)]) +
geom_histogram(aes(x=quant_dtu_freq), fill="darkgreen", breaks=seq(0,1,0.02)) +
xlab("Reproducibility") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_60$Parameters$quant_reprod_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,100))
)
# Ensembl 87
print(
ggplot(data= rat_ipf_87$Genes[ngids, .(parent_id, quant_dtu_freq)]) +
geom_histogram(aes(x=quant_dtu_freq), fill="darkgreen", breaks=seq(0,1,0.02)) +
xlab("Reproducibility") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_87$Parameters$quant_reprod_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,100))
)
## Warning: Removed 4 rows containing non-finite values (stat_bin).
With each annotation, the majority of genes reported by Deng at al. are reproducibly non-DTU, with several more genes spread out through the range of reproducibility indicating unstable quantifications.
Are they failing because of the p-values?
# Ensembl 60
print(
ggplot(data= rat_ipf_60$Genes[ngids, .(parent_id, pval_corr)]) +
geom_histogram(aes(x=pval_corr), fill="darkblue", breaks=seq(0,1,0.02)) +
xlab("Significance") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_60$Parameters$p_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0), trans='sqrt') +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,250))
)
# Ensembl 87
print(
ggplot(data= rat_ipf_87$Genes[ngids, .(parent_id, pval_corr)]) +
geom_histogram(aes(x=pval_corr), fill="darkblue", breaks=seq(0,1,0.02)) +
xlab("Significance") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_87$Parameters$p_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0), trans='sqrt') +
scale_x_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,250))
)
## Warning: Removed 4 rows containing non-finite values (stat_bin).
The vast majority of these genes are statistically significant, so this is not the limiting factor.
Are they failing because of the effect size then?
# Ensembl 60
print(
ggplot(data= rat_ipf_60$Genes[ngids, .(maxDprop, parent_id)]) +
geom_histogram(aes(x=maxDprop), fill="darkred", breaks=seq(-1,1,0.02)) +
xlab("Largest difference in isoform proportion") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_60$Parameters$dprop_thresh, colour="grey50") +
geom_vline(xintercept=-rat_ipf_60$Parameters$dprop_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,20))
)
# Ensembl 87
print(
ggplot(data= rat_ipf_87$Genes[ngids, .(maxDprop, parent_id)]) +
geom_histogram(aes(x=maxDprop), fill="darkred", breaks=seq(-1,1,0.02)) +
xlab("Largest difference in isoform proportion") +
ylab("Genes") +
geom_vline(xintercept=rat_ipf_87$Parameters$dprop_thresh, colour="grey50") +
geom_vline(xintercept=-rat_ipf_87$Parameters$dprop_thresh, colour="grey50") +
theme(axis.line.x=element_line()) +
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(ylim=c(0,20))
)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
Indeed, in the majority of genes reported by Deng et al, the largest change observed among their isoforms is smaller 20 percentile points of relative abundance, which is the limit used in our analysis.